Machine Learning Models for Geographic and Remote Work Analysis

KMeans Clustering, Salary Prediction, and Remote Work Classification

Author
Affiliations

Team 3

Boston University

Published

09-27-2025

Modified

December 12, 2025

x

1 Introduction

2 Data Cleaning & Preprocessing


title: "Data Analysis" subtitle: "Comprehensive Data Cleaning & Exploratory Analysis" author: - name: "Bingrui Qiao" - name: "Zhengyu Zhou" - name: "Junhao Wang" affiliations: "Boston University" bibliography: references.bib csl: csl/econometrica.csl format: html: toc: true number-sections: true execute: echo: true eval: true freeze: false error: false cache: false enabled: !expr (os.getenv("CI", "false") != "true") jupyter: python3 —


2.1 Data Cleaning & Preprocessing

In this section, we clean the Lightcast dataset, log each step, and save a reproducible cleaned CSV for downstream EDA.

# Write-permission smoke test: create logs/ and drop a timestamped ping file.
import os, datetime

os.makedirs("logs", exist_ok=True)
stamp = datetime.datetime.now()
with open("logs/_ping.txt", "w", encoding="utf-8") as f:
    f.write(f"hello @ {stamp}\n")
print("WROTE: logs/_ping.txt")

2.1.1 Setup & Load (clean version)

import os, datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import subprocess
import seaborn as sns


# missingno is optional: import it ONLY inside this guard.  (The previous
# unconditional `import missingno as msno` above would raise ImportError
# before the try/except could run, defeating the fallback entirely.)
try:
    import missingno as msno
    HAS_MSNO = True
except ImportError:
    HAS_MSNO = False

# Paths for raw input, cleaned output, cleaning log, and the missingness figure.
DATA_PATH  = "data/lightcast_job_postings.csv"
CLEAN_PATH = "data/cleaned_lightcast.csv"
LOG_PATH   = "logs/cleaning_log.txt"
FIG_MISS   = "figures/missing_values_heatmap.png"

# Ensure output dirs exist
os.makedirs("logs", exist_ok=True)
os.makedirs("figures", exist_ok=True)

def log(msg: str):
    """Print *msg* and append it (newline-terminated) to the cleaning log."""
    print(msg)
    with open(LOG_PATH, "a", encoding="utf-8") as f:
        f.write(msg.rstrip() + "\n")

# Start a fresh log for this run
with open(LOG_PATH, "w", encoding="utf-8") as f:
    f.write("=== DATA CLEANING LOG START ===\n")

# Ping file (to confirm write permission)
with open("logs/_ping.txt", "w", encoding="utf-8") as f:
    f.write("hello from python @ " + str(datetime.datetime.now()) + "\n")

# Auto-download the dataset from Google Drive when not present locally.
if not os.path.exists(DATA_PATH):
    gdrive_url = "https://drive.google.com/uc?id=1V2GCHGt2dkFGqVBeoUFckU4IhUgk4ocQ"
    try:
        import gdown
        gdown.download(gdrive_url, DATA_PATH, quiet=False)
    except Exception as e:
        raise FileNotFoundError(f"Could not download dataset.\nError: {e}")


# Load data; on_bad_lines="skip" drops malformed rows instead of aborting.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH}. Check path & working dir.")
df = pd.read_csv(DATA_PATH, low_memory=False, on_bad_lines="skip")
log(f"Loaded dataset → rows: {len(df):,}, cols: {df.shape[1]}")

2.1.2 Drop redundant/irrelevant columns

# Normalize column names FIRST so the drop list (written in upper case)
# matches regardless of the raw header casing; dropping before normalizing
# would silently miss mixed-case headers (errors="ignore" hides that).
df.columns = df.columns.str.strip().str.upper()

# Redundant identifiers and granular NAICS/SOC code columns not used downstream.
columns_to_drop = [
    "ID", "URL", "ACTIVE_URLS", "DUPLICATES", "LAST_UPDATED_TIMESTAMP",
    "NAICS2", "NAICS3", "NAICS4", "NAICS5", "NAICS6",
    "SOC_2", "SOC_3", "SOC_5",
]
before_cols = df.shape[1]
df.drop(columns=columns_to_drop, inplace=True, errors="ignore")
after_cols = df.shape[1]
log(f"Dropped {before_cols - after_cols} columns; remaining columns: {after_cols}")

# Basic type coercions: posting date → datetime, salary → numeric
# (strip currency symbols/commas before conversion; unparseable → NaN).
if "POSTED" in df.columns:
    df["POSTED"] = pd.to_datetime(df["POSTED"], errors="coerce")

if "SALARY" in df.columns:
    df["SALARY"] = pd.to_numeric(
        df["SALARY"].astype(str).str.replace(r"[^0-9.-]", "", regex=True),
        errors="coerce",
    )

2.1.3 Visualize & handle missing values

# --- Missing values correlation heatmap (like your classmate) ---

# --- Missingness-correlation heatmap ---
# Correlate the 0/1 missing-value indicators across columns to reveal which
# fields tend to be absent together.

cols_with_na = [c for c in df.columns if df[c].isna().any()]
sub = df[cols_with_na]

if len(cols_with_na) >= 2:
    # Cap at the 25 most-missing columns to keep the plot legible.
    if len(cols_with_na) > 25:
        most_missing = sub.isna().mean().sort_values(ascending=False).head(25).index
        sub = sub[most_missing]

    na_corr = sub.isna().astype(int).corr()

    # Hide the redundant upper triangle of the symmetric matrix.
    upper_mask = np.triu(np.ones_like(na_corr, dtype=bool))

    plt.figure(figsize=(11, 8))
    sns.heatmap(
        na_corr,
        mask=upper_mask,
        cmap="coolwarm",
        vmin=-1,
        vmax=1,
        annot=True,
        fmt=".1f",
        linewidths=.5,
        cbar_kws={"shrink": .8},
    )
    plt.title("Missing Values Correlation Heatmap", fontsize=14)
    plt.xticks(rotation=45, ha="right", fontsize=8)
    plt.yticks(rotation=0, fontsize=8)
    plt.tight_layout()

    # NOTE(review): this rebinds FIG_MISS, so the path declared in the setup
    # cell is never written — confirm which filename is intended.
    FIG_MISS = "figures/missing_values_corr_heatmap.png"
    plt.savefig(FIG_MISS, dpi=150)
    plt.show()

else:
    log("No columns with missing values; skipping heatmap.")

2.1.4 Remove duplicates

# De-duplicate postings on a natural key (title/company/location/post date),
# keeping the first occurrence of each group.
subset_cols = [c for c in ("TITLE", "COMPANY", "LOCATION", "POSTED") if c in df.columns]
before = len(df)
if subset_cols:
    df.drop_duplicates(subset=subset_cols, keep="first", inplace=True)
    after = len(df)
    log(f"Removed duplicates by {subset_cols}: {before - after} rows dropped; remaining: {after}")

2.1.5 Optional salary sanity filter

# Sanity-filter salaries: values outside [1, 1,000,000] are treated as data
# errors, nulled out, and re-imputed with the column median.
if "SALARY" in df.columns:
    bad = (df["SALARY"] < 1) | (df["SALARY"] > 1_000_000)
    n_bad = int(bad.sum())
    if n_bad > 0:
        df.loc[bad, "SALARY"] = np.nan
        med2 = df["SALARY"].median()
        # Assign rather than fillna(..., inplace=True) on the column:
        # inplace fillna on a selected column is deprecated chained
        # assignment in pandas and may not write back to df.
        df["SALARY"] = df["SALARY"].fillna(med2)
        log(f"Clipped extreme Salary ({n_bad} rows) and refilled with median {med2:.2f}")

2.1.6 Save & summary

# Persist the cleaned dataset and log a short closing summary.
df.to_csv(CLEAN_PATH, index=False)
log(f"Saved cleaned dataset → {CLEAN_PATH}")

n_rows, n_cols = df.shape
summary = f"Rows: {n_rows:,}\nColumns: {n_cols}\nSample columns: {list(df.columns)[:12]}"
print(summary)
log("✅ Cleaning pipeline finished successfully.")

2.2 Exploratory Data Analysis (EDA)

Exploratory Data Analysis (EDA) allows us to identify patterns and distributions in the job market dataset.
In this section, we focus on three aspects: 1. Job postings by industry
2. Salary distributions
3. Remote vs. on-site job proportions


2.2.1 Job Postings by Industry

Understanding industry demand helps reveal which sectors are most active in hiring.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load the cleaned dataset produced by the cleaning pipeline.
df = pd.read_csv("data/cleaned_lightcast.csv", low_memory=False)

# Ensure /figures directory exists
os.makedirs("figures", exist_ok=True)

# Industry column used throughout this section.
industry_col = "NAICS_2022_6_NAME"

# Clean up column names
df.columns = df.columns.str.strip()

# Drop postings with no industry classification.  Use the shared
# industry_col variable instead of repeating the literal column name,
# so the two stay consistent if the column ever changes.
df = df[~df[industry_col].str.lower().str.contains("unclassified", na=False)]

# Count top 10 industries
top_industries = df[industry_col].value_counts().head(10)

# Horizontal bar chart: easiest format for comparing industry volumes.
plt.figure(figsize=(10, 6))
sns.barplot(x=top_industries.values, y=top_industries.index, orient="h")
plt.title("Top 10 Industries by Job Postings")
plt.xlabel("Number of Job Postings")
plt.ylabel("Industry")
plt.tight_layout()
plt.savefig("figures/industry_postings.png", dpi=300)
plt.show()

A bar chart was chosen to visualize the number of job postings across different industries. This format makes it easy to compare industry demand and helps job seekers understand which sectors have the highest hiring activity.

Job demand is concentrated in tech and professional services: Custom Computer Programming Services leads, followed closely by Management Consulting, Employment Placement Agencies, and Computer Systems Design—each with ~4–5k postings. After these, volumes drop sharply to a second tier (Commercial Banking, CPA offices, Temporary Help, Health/Medical Insurance, Other Computer Services, Colleges/Universities) at ~1.3–2.0k. The prominence of staffing/placement agencies signals broad, economy-wide hiring, while the tech-heavy top ranks highlight sustained demand for digital and knowledge-worker skills.

2.2.2 Salary Distribution by Industry

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("data/cleaned_lightcast.csv", low_memory=False)

# Define column names
industry_col = "NAICS_2022_6_NAME"
salary_col = "SALARY"

# Keep only rows with a positive salary.
df = df[df[salary_col] > 0]
#df = df[~df[industry_col].str.lower().str.contains("unclassified")]

# Select top 10 industries by posting count
top10 = df[industry_col].value_counts().head(10).index
df_top10 = df[df[industry_col].isin(top10)]

# --- Boxplot Visualization ---
# Set the style BEFORE creating the figure: sns.set_style only affects
# figures created after it is called, so the original call placed after
# plotting had no effect on this chart.
sns.set_style("whitegrid")

palette = sns.color_palette("Spectral", n_colors=10)

plt.figure(figsize=(12, 8))

# Assign the y variable to `hue` with legend=False: passing `palette`
# without `hue` is deprecated in seaborn (emits the FutureWarning seen
# in the rendered output) and will be removed in v0.14.
sns.boxplot(
    data=df_top10,
    y=industry_col,
    x=salary_col,
    hue=industry_col,
    legend=False,
    palette=palette,
    showfliers=True,     # keep outliers visible
    linewidth=1.2
)

# Add title & labels
plt.title("Salary Distribution by Industry (Top 10 Industries)", fontsize=18, weight="bold")
plt.xlabel("Salary (USD)", fontsize=14)
plt.ylabel("Industry", fontsize=14)

plt.grid(axis="x", linestyle="--", alpha=0.3)

plt.tight_layout()

# Save figure
plt.savefig("figures/salary_distribution_by_industry_top10.png", dpi=300, bbox_inches="tight")
plt.show()
/tmp/ipykernel_5950/3831417689.py:25: FutureWarning:



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

We use a box plot to show the salary distribution of each major recruiting industry. Unlike bar charts, which only show averages, box plots reveal the median, the range of variation, and outlier values within each industry. This helps job seekers understand not only which industries have higher average salaries, but also how widely salaries fluctuate and where the greatest potential for salary growth lies.

In various industries, some knowledge-intensive and service-intensive fields (such as unclassified industries and customized computer programming services) have relatively high salary levels and are more dispersed. The median salary in these areas is relatively high, and the upper tail is long, which indicates that there are a large number of highly paid positions. Management consulting, computer system design, and direct health and medical insurance industries also show a large range of remuneration, reflecting the wide range of remuneration within these professions. In contrast, industries such as colleges and universities, certified public accountants, commercial banks and software publishers have a tighter scope and fewer extreme outliers, indicating that the remuneration range of these industries is more standardized and there is less room for improvement at the top of the salary distribution.

2.2.3 Remote vs. On-Site Jobs

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

os.makedirs("figures", exist_ok=True)

df = pd.read_csv("data/cleaned_lightcast.csv", low_memory=False)

# Normalize REMOTE_TYPE_NAME: lower-case, trim, and map every "missing"
# spelling to None so value_counts ignores those rows.
null_aliases = {"[none]": None, "none": None, "unknown": None,
                "nan": None, "na": None, "null": None, "": None}
df["REMOTE_TYPE_NAME"] = (
    df["REMOTE_TYPE_NAME"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace(null_aliases)
)

# Tally each remote-work category.
remote_counts = df["REMOTE_TYPE_NAME"].value_counts()

# Pie chart of remote / hybrid / on-site shares.
plt.figure(figsize=(6, 6))
plt.pie(
    remote_counts.values,
    labels=remote_counts.index,
    autopct="%1.1f%%",
    startangle=90,
    wedgeprops={"edgecolor": "white"},
)
plt.title("Remote vs. On-Site Job Distribution", fontsize=13)
plt.tight_layout()
plt.savefig("figures/remote_vs_onsite.png", dpi=300, bbox_inches="tight")
plt.show()

A pie chart was selected to display the proportions of job types (remote, hybrid, and on-site). This format offers an intuitive visual summary, helping job seekers understand the flexibility of job opportunities in 2024.

The job market here is overwhelmingly remote: nearly four out of five postings (78.4%) are fully remote, while another 14.4% offer hybrid options. Only 7.3% require full on-site work. This mix signals strong employer flexibility and broad access to roles regardless of location, with hybrid emerging as a meaningful—but secondary—model. For candidates, remote-first skills (self-management, async collaboration) are likely at a premium, while fully on-site opportunities are comparatively scarce.

import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

os.makedirs("figures", exist_ok=True)

df = pd.read_csv("data/cleaned_lightcast.csv", low_memory=False)

# =========================================
# 1️⃣ Clean REMOTE_TYPE_NAME
# =========================================
# Lower-case, trim, and collapse every "missing" spelling to None.
null_aliases = {
    "[none]": None,
    "none": None,
    "unknown": None,
    "nan": None,
    "na": None,
    "null": None,
    "": None,
}
df["REMOTE_TYPE_NAME"] = (
    df["REMOTE_TYPE_NAME"].astype(str).str.strip().str.lower().replace(null_aliases)
)

# =========================================
# 2️⃣ Re-group into Remote / Hybrid / Onsite
# =========================================
def map_remote(x):
    """Collapse a cleaned remote-type label into Remote / Hybrid / Onsite.

    Missing values and unrecognized labels default to "Onsite".
    """
    if pd.isna(x):
        return "Onsite"
    # Any label mentioning both words counts as hybrid (covers "hybrid remote").
    if "remote" in x and "hybrid" in x:
        return "Hybrid"
    exact = {"hybrid remote": "Hybrid", "remote": "Remote", "not remote": "Onsite"}
    return exact.get(x, "Onsite")

# Apply the three-way grouping to every posting.
df["REMOTE_GROUP"] = df["REMOTE_TYPE_NAME"].apply(map_remote)

print(df["REMOTE_GROUP"].value_counts())

# =========================================
# 3️⃣ Visualization (Pie Chart)
# =========================================
remote_counts = df["REMOTE_GROUP"].value_counts()

plt.figure(figsize=(6, 6))
plt.pie(
    remote_counts.values,
    labels=remote_counts.index,
    autopct="%1.1f%%",
    startangle=90,
    wedgeprops={"edgecolor": "white"}
)
plt.title("Remote vs. Hybrid vs. Onsite Job Distribution", fontsize=13)
plt.tight_layout()
# Save under a distinct name: the earlier cell already wrote a different
# chart to figures/remote_vs_onsite.png, and reusing that path clobbered it.
plt.savefig("figures/remote_vs_hybrid_vs_onsite.png", dpi=300, bbox_inches="tight")
plt.show()
REMOTE_GROUP
Onsite    55304
Remote    11743
Hybrid     2151
Name: count, dtype: int64

3 Skill Gap Analysis

4 Create a team-based skill dataframe

import pandas as pd

# Self-assessed proficiency per team member (higher = stronger).
members = ["Bingrui Qiao", "Zhengyu Zhou", "Junhao Wang"]
skills_data = {
    "Name": members,
    "Python": [2, 2, 3],
    "SQL": [2, 2, 3],
    "Machine Learning": [1, 1, 1],
    "Cloud Computing": [2, 2, 3],
    "Docker": [1, 1, 1],
    "AWS": [2, 3, 2],
}

df_skills = pd.DataFrame(skills_data).set_index("Name")
df_skills
Python SQL Machine Learning Cloud Computing Docker AWS
Name
Bingrui Qiao 2 2 1 2 1 2
Zhengyu Zhou 2 2 1 2 1 3
Junhao Wang 3 3 1 3 1 2

5 Visualizing Skill Gaps

import seaborn as sns
import matplotlib.pyplot as plt

# White-grid theme shared by the skill figures.
sns.set_theme(style="whitegrid", font_scale=1.1, rc={
    "axes.facecolor": "white",
    "figure.facecolor": "white",
    "axes.edgecolor": "gray",
    "grid.color": "lightgray"
})

# Diverging blue→red palette.
cmap = sns.diverging_palette(220, 20, as_cmap=True)

# Annotated heatmap of each member's skill levels.
plt.figure(figsize=(9, 6))
ax = sns.heatmap(
    df_skills,
    annot=True,
    fmt=".0f",
    cmap=cmap,
    linewidths=0.7,
    cbar_kws={'label': 'Skill Level'},
    square=True,
)

# Titles and axis labels.
plt.title("Team Skill Levels Heatmap", fontsize=15, weight="bold", pad=20)
plt.xlabel("Skill Domain", fontsize=12)
plt.ylabel("Team Member", fontsize=12)

# Tidy tick placement before saving.
plt.xticks(rotation=30, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()

plt.savefig("figures/Skill_Gap_Analysis.png", dpi=300, bbox_inches="tight")
plt.show()

6 Compare skill

# Load data
df = pd.read_csv("data/cleaned_lightcast.csv")

# Count how many postings mention each team skill in the job description.
# regex=False: skill names are matched as literal substrings, so a name
# containing regex metacharacters (e.g. "C++") could never be silently
# misinterpreted as a pattern.
top_skills = df_skills.columns.tolist()
job_text = df["BODY"].fillna("")
skill_counts = {s: int(job_text.str.contains(s, case=False, regex=False).sum()) for s in top_skills}

# Append demand row
df_skills.loc["Market Demand"] = [skill_counts[s] for s in top_skills]
df_skills
Python SQL Machine Learning Cloud Computing Docker AWS
Name
Bingrui Qiao 2 2 1 2 1 2
Zhengyu Zhou 2 2 1 2 1 3
Junhao Wang 3 3 1 3 1 2
Market Demand 11782 23202 3972 1302 688 14243

7 Visual heatmap

import seaborn as sns
import matplotlib.pyplot as plt

# Reuse the report's white-grid theme.
sns.set_theme(style="whitegrid", font_scale=1.1, rc={
    "axes.facecolor": "white",
    "figure.facecolor": "white",
    "axes.edgecolor": "gray",
    "grid.color": "lightgray"
})

cmap = sns.diverging_palette(220, 20, as_cmap=True)

# --- Team skill levels: every row except the appended "Market Demand" ---
team_rows = df_skills.iloc[:-1]
plt.figure(figsize=(8, 2.5))
plt.imshow(team_rows, aspect="auto", cmap=cmap)
plt.colorbar(label="Skill Level (1–5)")
plt.yticks(range(len(df_skills.index) - 1), df_skills.index[:-1], fontsize=10)
plt.xticks(range(len(df_skills.columns)), df_skills.columns, rotation=30, ha="right", fontsize=9)
plt.title("Team Skill Levels", fontsize=13, weight="bold")
plt.tight_layout()
plt.savefig("figures/Team_Skill_Level.png", dpi=300, bbox_inches="tight")
plt.show()

# --- Market demand: the single appended row, on its own color scale ---
plt.figure(figsize=(8, 2))
plt.imshow([df_skills.loc["Market Demand"]], aspect="auto", cmap=cmap)
plt.colorbar(label="Market Demand Count")
plt.yticks([0], ["Market Demand"], fontsize=10)
plt.xticks(range(len(df_skills.columns)), df_skills.columns, rotation=30, ha="right", fontsize=9)
plt.title("Market Demand", fontsize=13, weight="bold")
plt.tight_layout()
plt.savefig("figures/Market_Demand.png", dpi=300, bbox_inches="tight")
plt.show()

8 Machine Learning Models

9 Classification: Remote vs Non-Remote Jobs

# Import necessary libraries
import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np

# Set seed for reproducibility
np.random.seed(42)

# Set plotly renderer
pio.renderers.default = "notebook+notebook_connected+vscode"

# Load the raw postings with pandas.  (The previous comment said
# "Initialize Spark Session" — this notebook uses pandas, not Spark.)
df = pd.read_csv("data/lightcast_job_postings.csv", low_memory=False)

# Print schema and preview first few rows
print("--- Diagnostic check: Schema and sample rows ---")
print(df.info())
print(df.head())
--- Diagnostic check: Schema and sample rows ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72498 entries, 0 to 72497
Columns: 131 entries, ID to NAICS_2022_6_NAME
dtypes: float64(38), object(93)
memory usage: 72.5+ MB
None
                                         ID LAST_UPDATED_DATE  \
0  1f57d95acf4dc67ed2819eb12f049f6a5c11782c          9/6/2024   
1  0cb072af26757b6c4ea9464472a50a443af681ac          8/2/2024   
2  85318b12b3331fa490d32ad014379df01855c557          9/6/2024   
3  1b5c3941e54a1889ef4f8ae55b401a550708a310          9/6/2024   
4  cb5ca25f02bdf25c13edfede7931508bfd9e858f         6/19/2024   

      LAST_UPDATED_TIMESTAMP  DUPLICATES    POSTED    EXPIRED  DURATION  \
0  2024-09-06 20:32:57.352 Z         0.0  6/2/2024   6/8/2024       6.0   
1  2024-08-02 17:08:58.838 Z         0.0  6/2/2024   8/1/2024       NaN   
2  2024-09-06 20:32:57.352 Z         1.0  6/2/2024   7/7/2024      35.0   
3  2024-09-06 20:32:57.352 Z         1.0  6/2/2024  7/20/2024      48.0   
4  2024-06-19 07:00:00.000 Z         0.0  6/2/2024  6/17/2024      15.0   

             SOURCE_TYPES                                        SOURCES  \
0       [\n  "Company"\n]                        [\n  "brassring.com"\n]   
1     [\n  "Job Board"\n]                            [\n  "maine.gov"\n]   
2     [\n  "Job Board"\n]                           [\n  "dejobs.org"\n]   
3     [\n  "Job Board"\n]  [\n  "disabledperson.com",\n  "dejobs.org"\n]   
4  [\n  "FreeJobBoard"\n]                       [\n  "craigslist.org"\n]   

                                                 URL  ... NAICS_2022_2  \
0  [\n  "https://sjobs.brassring.com/TGnewUI/Sear...  ...         44.0   
1   [\n  "https://joblink.maine.gov/jobs/1085740"\n]  ...         56.0   
2  [\n  "https://dejobs.org/dallas-tx/data-analys...  ...         52.0   
3  [\n  "https://www.disabledperson.com/jobs/5948...  ...         52.0   
4  [\n  "https://modesto.craigslist.org/sls/77475...  ...         99.0   

                                   NAICS_2022_2_NAME NAICS_2022_3  \
0                                       Retail Trade        441.0   
1  Administrative and Support and Waste Managemen...        561.0   
2                              Finance and Insurance        524.0   
3                              Finance and Insurance        522.0   
4                              Unclassified Industry        999.0   

                              NAICS_2022_3_NAME NAICS_2022_4  \
0               Motor Vehicle and Parts Dealers       4413.0   
1           Administrative and Support Services       5613.0   
2     Insurance Carriers and Related Activities       5242.0   
3  Credit Intermediation and Related Activities       5221.0   
4                         Unclassified Industry       9999.0   

                                   NAICS_2022_4_NAME  NAICS_2022_5  \
0  Automotive Parts, Accessories, and Tire Retailers       44133.0   
1                                Employment Services       56132.0   
2  Agencies, Brokerages, and Other Insurance Rela...       52429.0   
3                   Depository Credit Intermediation       52211.0   
4                              Unclassified Industry       99999.0   

                            NAICS_2022_5_NAME NAICS_2022_6  \
0  Automotive Parts and Accessories Retailers     441330.0   
1                     Temporary Help Services     561320.0   
2          Other Insurance Related Activities     524291.0   
3                          Commercial Banking     522110.0   
4                       Unclassified Industry     999999.0   

                            NAICS_2022_6_NAME  
0  Automotive Parts and Accessories Retailers  
1                     Temporary Help Services  
2                            Claims Adjusting  
3                          Commercial Banking  
4                       Unclassified Industry  

[5 rows x 131 columns]
# Keep only the columns needed for the classification analysis.
relevant_columns = [
    "SALARY", "MIN_YEARS_EXPERIENCE", "EDUCATION_LEVELS_NAME",
    "EMPLOYMENT_TYPE_NAME", "REMOTE_TYPE_NAME", "DURATION",
    "IS_INTERNSHIP", "COMPANY_IS_STAFFING", "STATE_NAME", "CITY_NAME",
    "MSA_NAME", "ONET", "ONET_NAME", "NAICS2_NAME", "TITLE_NAME",
]

# Copy so later mutations never touch the raw frame.
df_analysis = df.loc[:, relevant_columns].copy()

df_analysis.head()
SALARY MIN_YEARS_EXPERIENCE EDUCATION_LEVELS_NAME EMPLOYMENT_TYPE_NAME REMOTE_TYPE_NAME DURATION IS_INTERNSHIP COMPANY_IS_STAFFING STATE_NAME CITY_NAME MSA_NAME ONET ONET_NAME NAICS2_NAME TITLE_NAME
0 NaN 2.0 [\n "Bachelor's degree"\n] Full-time (> 32 hours) [None] 6.0 False False Arkansas El Dorado, AR El Dorado, AR 15-2051.01 Business Intelligence Analysts Retail Trade Enterprise Analysts
1 NaN 3.0 [\n "No Education Listed"\n] Full-time (> 32 hours) Remote NaN False True Maine Augusta, ME Augusta-Waterville, ME 15-2051.01 Business Intelligence Analysts Administrative and Support and Waste Managemen... Oracle Consultants
2 NaN 5.0 [\n "Bachelor's degree"\n] Full-time (> 32 hours) [None] 35.0 False False Texas Dallas, TX Dallas-Fort Worth-Arlington, TX 15-2051.01 Business Intelligence Analysts Finance and Insurance Data Analysts
3 NaN 3.0 [\n "No Education Listed"\n] Full-time (> 32 hours) [None] 48.0 False False Arizona Phoenix, AZ Phoenix-Mesa-Chandler, AZ 15-2051.01 Business Intelligence Analysts Finance and Insurance Management Analysts
4 92500.0 NaN [\n "No Education Listed"\n] Part-time / full-time [None] 15.0 False False California Modesto, CA Modesto, CA 15-2051.01 Business Intelligence Analysts Unclassified Industry Unclassified
# --- REMOTE_GROUP: collapse raw remote-type labels into three buckets ---
df_analysis["REMOTE_TYPE_NAME"] = df_analysis["REMOTE_TYPE_NAME"].astype(str).str.strip()

# "[None]" is the dataset's placeholder for a missing remote type; map it to
# a real null so the isna() branch below can catch those rows.
remote_col = df_analysis["REMOTE_TYPE_NAME"].replace({"[None]": None})

# np.select checks conditions in order; unmatched rows fall to "Onsite".
df_analysis["REMOTE_GROUP"] = np.select(
    [
        remote_col.eq("Remote"),
        remote_col.eq("Hybrid Remote"),
        remote_col.eq("Not Remote"),
        remote_col.isna()
    ],
    ["Remote", "Hybrid", "Onsite", "Onsite"],
    default="Onsite"
)

df_analysis.drop(columns=["REMOTE_TYPE_NAME"], inplace=True)


# --- EMPLOYMENT_GROUP: collapse employment types into three buckets ---
emp_col = df_analysis["EMPLOYMENT_TYPE_NAME"].astype(str).str.strip()

# NOTE(review): after astype(str), missing values become the string "nan",
# so emp_col.isna() can never be True here — that branch is dead and such
# rows fall through to the "Flexible" default. Confirm this is intended.
# NOTE(review): the "⤠" in the part-time label below looks like mojibake
# of "≤" — verify it matches the literal values in the raw data.
df_analysis["EMPLOYMENT_GROUP"] = np.select(
    [
        emp_col.eq("Full-time (> 32 hours)"),
        emp_col.eq("Part-time (⤠32 hours)"),
        emp_col.eq("Part-time / full-time"),
        emp_col.isna()
    ],
    ["Full-time", "Part-time", "Flexible", "Full-time"],
    default="Flexible"
)

df_analysis.drop(columns=["EMPLOYMENT_TYPE_NAME"], inplace=True)

# --- MIN_YEARS_EXPERIENCE: impute missing as 0, then bucket into levels ---

df_analysis["MIN_YEARS_EXPERIENCE"] = df_analysis["MIN_YEARS_EXPERIENCE"].fillna(0)

# Coerce to numeric; any unparseable values also become 0.
df_analysis["MIN_YEARS_EXPERIENCE"] = pd.to_numeric(
    df_analysis["MIN_YEARS_EXPERIENCE"],
    errors="coerce"
).fillna(0)

exp = df_analysis["MIN_YEARS_EXPERIENCE"]

# Buckets: [0,1] entry, (1,3] junior, (3,5] mid, (5,10] senior, >10 expert.
df_analysis["MIN_YEARS_EXPERIENCE_GROUP"] = np.select(
    [
        (exp >= 0) & (exp <= 1),
        (exp > 1) & (exp <= 3),
        (exp > 3) & (exp <= 5),
        (exp > 5) & (exp <= 10),
        (exp > 10)
    ],
    ["Internship/Entry Level", "Junior", "Mid-Level", "Senior", "Expert"],
    default="Internship/Entry Level"
)


# --- DURATION: treat missing and zero-day postings as one day ---

dur = df_analysis["DURATION"]
df_analysis["DURATION"] = (
    dur.fillna(1)
       .replace(0, 1)
)


# --- EDUCATION_LEVELS_NAME: strip the bracket/newline wrapping seen in the raw export ---

edu_raw = df_analysis["EDUCATION_LEVELS_NAME"].fillna("")
edu_clean = (
    edu_raw
    .astype(str)
    .str.replace(r"[\[\]\n]", "", regex=True)
    .str.strip()
)
df_analysis["EDUCATION_LEVELS_CLEAN"] = edu_clean
df_analysis.drop(columns=["EDUCATION_LEVELS_NAME"], inplace=True)


# --- Geography: fill missing state/city with a sentinel ---


df_analysis["STATE_NAME"] = df_analysis["STATE_NAME"].fillna("Unknown")
df_analysis["CITY_NAME"] = df_analysis["CITY_NAME"].fillna("Unknown")


# --- Occupation/industry codes: fill nulls with explicit placeholders ---

df_analysis["ONET"] = df_analysis["ONET"].fillna("00-0000.00")
df_analysis["ONET_NAME"] = df_analysis["ONET_NAME"].fillna("Unknown")
df_analysis["NAICS2_NAME"] = df_analysis["NAICS2_NAME"].fillna("Unknown")

print(df_analysis.head())
    SALARY  MIN_YEARS_EXPERIENCE  DURATION IS_INTERNSHIP COMPANY_IS_STAFFING  \
0      NaN                   2.0       6.0         False               False   
1      NaN                   3.0       1.0         False                True   
2      NaN                   5.0      35.0         False               False   
3      NaN                   3.0      48.0         False               False   
4  92500.0                   0.0      15.0         False               False   

   STATE_NAME      CITY_NAME                         MSA_NAME        ONET  \
0    Arkansas  El Dorado, AR                    El Dorado, AR  15-2051.01   
1       Maine    Augusta, ME           Augusta-Waterville, ME  15-2051.01   
2       Texas     Dallas, TX  Dallas-Fort Worth-Arlington, TX  15-2051.01   
3     Arizona    Phoenix, AZ        Phoenix-Mesa-Chandler, AZ  15-2051.01   
4  California    Modesto, CA                      Modesto, CA  15-2051.01   

                        ONET_NAME  \
0  Business Intelligence Analysts   
1  Business Intelligence Analysts   
2  Business Intelligence Analysts   
3  Business Intelligence Analysts   
4  Business Intelligence Analysts   

                                         NAICS2_NAME           TITLE_NAME  \
0                                       Retail Trade  Enterprise Analysts   
1  Administrative and Support and Waste Managemen...   Oracle Consultants   
2                              Finance and Insurance        Data Analysts   
3                              Finance and Insurance  Management Analysts   
4                              Unclassified Industry         Unclassified   

  REMOTE_GROUP EMPLOYMENT_GROUP MIN_YEARS_EXPERIENCE_GROUP  \
0       Onsite        Full-time                     Junior   
1       Remote        Full-time                     Junior   
2       Onsite        Full-time                  Mid-Level   
3       Onsite        Full-time                     Junior   
4       Onsite         Flexible     Internship/Entry Level   

  EDUCATION_LEVELS_CLEAN  
0    "Bachelor's degree"  
1  "No Education Listed"  
2    "Bachelor's degree"  
3  "No Education Listed"  
4  "No Education Listed"  
# Restrict to a binary classification problem: keep only Remote and Onsite
# postings (Hybrid rows are excluded from the model).
df_binary = df_analysis.loc[df_analysis["REMOTE_GROUP"].isin(["Remote", "Onsite"])].copy()

print(df_binary["REMOTE_GROUP"].value_counts())
REMOTE_GROUP
Onsite    57741
Remote    12497
Name: count, dtype: int64
# Feature engineering: encode categorical variables for the classifiers.

import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Columns fed to the model.
categorical_cols = [
    "EMPLOYMENT_GROUP",
    "MIN_YEARS_EXPERIENCE_GROUP",
    "EDUCATION_LEVELS_CLEAN",
    "STATE_NAME",
    "NAICS2_NAME",
]
numeric_cols = ["MIN_YEARS_EXPERIENCE", "DURATION"]

# 1. Encode the target: Onsite/Remote -> integer labels.
label_encoder = LabelEncoder()
df_binary["label"] = label_encoder.fit_transform(df_binary["REMOTE_GROUP"])

# Show which class maps to which integer.
classes = label_encoder.classes_
print("Label mapping:", dict(zip(classes, label_encoder.transform(classes))))

# 2. Build the design matrix: one-hot the categoricals, pass numerics through.
X = df_binary[categorical_cols + numeric_cols]
y = df_binary["label"]

preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols),
    ]
)

X_features = preprocess.fit_transform(X)

print("--- Prepared Data Preview (pandas + sklearn) ---")
print("X_features type:", type(X_features))
print("X_features shape:", X_features.shape)
print("First 5 labels:", y.iloc[:5].tolist())
print("First 5 REMOTE_GROUP:", df_binary["REMOTE_GROUP"].iloc[:5].tolist())

# Densify: OneHotEncoder returns a sparse matrix by default.
X_dense = X_features.toarray() if hasattr(X_features, "toarray") else np.asarray(X_features)

# One row per posting: group name, numeric label, and its feature vector.
df_prepared = pd.DataFrame({
    "REMOTE_GROUP": df_binary["REMOTE_GROUP"].reset_index(drop=True),
    "label": y.reset_index(drop=True),
    "features": list(X_dense),
})

print("\n--- df_prepared preview ---")
print(df_prepared.head())
print(df_prepared.shape)
Label mapping: {'Onsite': np.int64(0), 'Remote': np.int64(1)}
--- Prepared Data Preview (pandas + sklearn) ---
X_features type: <class 'scipy.sparse._csr.csr_matrix'>
X_features shape: (70238, 112)
First 5 labels: [0, 1, 0, 0, 0]
First 5 REMOTE_GROUP: ['Onsite', 'Remote', 'Onsite', 'Onsite', 'Onsite']

--- df_prepared preview ---
  REMOTE_GROUP  label                                           features
0       Onsite      0  [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
1       Remote      1  [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
2       Onsite      0  [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...
3       Onsite      0  [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...
4       Onsite      0  [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
(70238, 3)
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed for reproducibility.
train_data, test_data = train_test_split(df_prepared, test_size=0.3, random_state=42)

print(f"Training set size: {len(train_data)}")
print(f"Test set size: {len(test_data)}")
Training set size: 49166
Test set size: 21072
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Stack the per-row feature vectors into dense (n_rows, n_features) matrices.
X_train = np.stack(train_data["features"].to_numpy())
y_train = train_data["label"].to_numpy()

X_test = np.stack(test_data["features"].to_numpy())
y_test = test_data["label"].to_numpy()

# Linear baseline.
lr_model = LogisticRegression(max_iter=1000, n_jobs=-1)
lr_model.fit(X_train, y_train)

# Non-linear ensemble baseline with a fixed seed.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

print("Both models trained successfully!")
Both models trained successfully!

10 Predict

import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


# Rebuild X_test/y_test from test_data so this cell can run on its own.
X_test = np.stack(test_data["features"].to_numpy())
y_test = test_data["label"].to_numpy()


# Predictions and positive-class probabilities (probabilities feed AUC-ROC).

# Logistic Regression
y_pred_lr = lr_model.predict(X_test)
y_proba_lr = lr_model.predict_proba(X_test)[:, 1]

# Random Forest
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]


# Evaluation: Accuracy / F1 / AUC-ROC.
# (f1_score is imported once above; the duplicate import is removed.)
print("LOGISTIC REGRESSION RESULTS")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(f"F1 Score (weighted): {f1_score(y_test, y_pred_lr, average='weighted'):.4f}")
print(f"AUC-ROC:  {roc_auc_score(y_test, y_proba_lr):.4f}")

print("RANDOM FOREST RESULTS")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"F1 Score (weighted): {f1_score(y_test, y_pred_rf, average='weighted'):.4f}")
print(f"AUC-ROC:  {roc_auc_score(y_test, y_proba_rf):.4f}")
LOGISTIC REGRESSION RESULTS
Accuracy: 0.8250
F1 Score (weighted): 0.7533
AUC-ROC:  0.6368
RANDOM FOREST RESULTS
Accuracy: 0.8349
F1 Score (weighted): 0.8168
AUC-ROC:  0.7293
# Step 7: Confusion Matrix
from sklearn.metrics import confusion_matrix

def _cm_as_frame(y_true, y_pred):
    """Return the 2x2 confusion matrix as a DataFrame with readable labels."""
    return pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=[0, 1]),
        index=["True 0 (Onsite)", "True 1 (Remote)"],
        columns=["Pred 0 (Onsite)", "Pred 1 (Remote)"],
    )

print("\n=== Logistic Regression Confusion Matrix (2x2) ===")
print("(rows = true label, cols = predicted label)")
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=[0, 1])
cm_lr_df = _cm_as_frame(y_test, y_pred_lr)
print(cm_lr_df)

print("\n=== Random Forest Confusion Matrix (2x2) ===")
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=[0, 1])
cm_rf_df = _cm_as_frame(y_test, y_pred_rf)
print(cm_rf_df)

=== Logistic Regression Confusion Matrix (2x2) ===
(rows = true label, cols = predicted label)
                 Pred 0 (Onsite)  Pred 1 (Remote)
True 0 (Onsite)            17274               59
True 1 (Remote)             3628              111

=== Random Forest Confusion Matrix (2x2) ===
                 Pred 0 (Onsite)  Pred 1 (Remote)
True 0 (Onsite)            16372              961
True 1 (Remote)             2517             1222

11 Confusion Matrix Visualization

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Confusion matrices in [0, 1] label order (Onsite first).
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=[0, 1])
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=[0, 1])

print("Logistic Regression CM:\n", cm_lr)
print("Random Forest CM:\n", cm_rf)

# plotly heatmaps want plain nested lists rather than ndarrays.
lr_z = cm_lr.tolist()
rf_z = cm_rf.tolist()

lr_x = ['Onsite', 'Remote']  # Predicted label
lr_y = ['Onsite', 'Remote']  # True label

# Side-by-side heatmaps, one per model; the colorbar appears only once,
# on the second (Random Forest) panel.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Logistic Regression', 'Random Forest')
)

for panel, (z_vals, with_scale) in enumerate([(lr_z, False), (rf_z, True)], start=1):
    fig.add_trace(
        go.Heatmap(
            z=z_vals,
            x=lr_x,
            y=lr_y,
            colorscale='Blues',
            text=z_vals,
            texttemplate="%{text}",
            showscale=with_scale,
        ),
        row=1, col=panel,
    )

fig.update_layout(
    title_text="Remote vs Onsite Job Classification - Confusion Matrix Comparison",
    title_font_size=16,
    height=400,
    width=900,
)

fig.update_xaxes(title_text="Predicted Label")
fig.update_yaxes(title_text="True Label")

fig.write_image("figures/confusion_matrix_comparison.jpg")
Logistic Regression CM:
 [[17274    59]
 [ 3628   111]]
Random Forest CM:
 [[16372   961]
 [ 2517  1222]]

12 KMeans Clustering using ONET as reference label

# KMeans Clustering using ONET as reference label

# Check the ONET distribution first.
# FIX: value_counts().reset_index() followed by rename(columns={"index": ...})
# yields TWO columns both named "count" on pandas >= 2.0 (reset_index already
# names them <value>/<count>), which is exactly what the earlier output showed.
# rename_axis + reset_index(name=...) is version-stable and produces the
# intended ["ONET_NAME", "count"] columns.
onet_dist_df = (
    df_analysis["ONET_NAME"]
    .value_counts()
    .rename_axis("ONET_NAME")
    .reset_index(name="count")
)

print(onet_dist_df.head(20))
                            count  count
0  Business Intelligence Analysts  72454
1                         Unknown     44

13 Prepare features for KMeans clustering

# Prepare features for KMeans clustering
# Columns fed into the clustering model.
cluster_categorical_cols = [
    "EMPLOYMENT_GROUP",
    "MIN_YEARS_EXPERIENCE_GROUP",
    "EDUCATION_LEVELS_CLEAN",
    "STATE_NAME",
    "REMOTE_GROUP",
]
cluster_numeric_cols = ["MIN_YEARS_EXPERIENCE", "DURATION"]

# Feature table restricted to the clustering columns only.
X_cluster = df_analysis[cluster_categorical_cols + cluster_numeric_cols]

# One-hot encode the categoricals and pass the numerics straight through
# (the pandas/sklearn analogue of Spark's StringIndexer + OneHotEncoder
# + VectorAssembler chain).
cluster_preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cluster_categorical_cols),
        ("num", "passthrough", cluster_numeric_cols),
    ]
)

cluster_pipeline = Pipeline(steps=[("preprocess", cluster_preprocess)])

# Fit and transform to produce the feature matrix.
X_cluster_features = cluster_pipeline.fit_transform(X_cluster)

# OneHotEncoder may emit a sparse matrix; densify either way.
X_cluster_dense = (
    X_cluster_features.toarray()
    if hasattr(X_cluster_features, "toarray")
    else np.asarray(X_cluster_features)
)

print("--- Clustering Features Shape ---")
print(X_cluster_dense.shape)

# Encode ONET_NAME -> integer ONET_label (reference label only; the ONET
# column is NOT one of the clustering inputs above).
onet_le = LabelEncoder()
df_analysis = df_analysis.copy()
df_analysis["ONET_NAME"] = df_analysis["ONET_NAME"].fillna("Unknown")
df_analysis["ONET_label"] = onet_le.fit_transform(df_analysis["ONET_NAME"])

print("ONET label mapping (first few):")
for label_idx, label_name in enumerate(onet_le.classes_[:10]):
    print(label_idx, "->", label_name)

# Assemble df_cluster: ONET_NAME | ONET_label | features
df_cluster = df_analysis.copy()
df_cluster["features"] = list(X_cluster_dense)

print("--- Clustering Data Prepared (pandas) ---")
print(df_cluster[["ONET_NAME", "ONET_label", "features"]].head())
print(df_cluster.shape)
--- Clustering Features Shape ---
(72498, 94)
ONET label mapping (first few):
0 -> Business Intelligence Analysts
1 -> Unknown
--- Clustering Data Prepared (pandas) ---
                        ONET_NAME  ONET_label  \
0  Business Intelligence Analysts           0   
1  Business Intelligence Analysts           0   
2  Business Intelligence Analysts           0   
3  Business Intelligence Analysts           0   
4  Business Intelligence Analysts           0   

                                            features  
0  [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...  
1  [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...  
2  [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...  
3  [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...  
4  [1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  
(72498, 18)

14 Find Optimal K Using Silhouette Scores

# Sweep candidate K values and score each clustering with the silhouette metric
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Pull the dense feature matrix back out of df_cluster.
X_cluster = np.stack(df_cluster["features"].to_numpy())   # (n_samples, n_features)

k_values = [3, 5, 7, 10, 15, 20]
silhouette_scores = []

print("--- Finding Optimal K ---")
for k in k_values:
    # seed = 42 for reproducibility
    labels = KMeans(n_clusters=k, random_state=42, n_init="auto").fit_predict(X_cluster)
    score = silhouette_score(X_cluster, labels)
    silhouette_scores.append(score)
    print(f"K = {k}: Silhouette Score = {score:.4f}")
--- Finding Optimal K ---
K = 3: Silhouette Score = 0.5132
K = 5: Silhouette Score = 0.4411
K = 7: Silhouette Score = 0.3804
K = 10: Silhouette Score = 0.3590
K = 15: Silhouette Score = 0.3334
K = 20: Silhouette Score = 0.2926

15 Extract the feature matrix

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Dense feature matrix for clustering.
X_cluster = np.stack(df_cluster["features"].to_numpy())  # (n_samples, n_features)

# K = 3 gave the best silhouette score in the sweep above.
optimal_k = 3

# Train the final model (seed = 42 for reproducibility).
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init="auto")
cluster_labels = kmeans_final.fit_predict(X_cluster)

# Attach cluster assignments to a copy of the clustering frame.
df_clustered = df_cluster.copy()
df_clustered["cluster"] = cluster_labels

# Silhouette score of the final model.
final_score = silhouette_score(X_cluster, cluster_labels)
print(f"\n--- Final KMeans Model (K={optimal_k}) ---")
print(f"Silhouette Score: {final_score:.4f}")

# How many postings landed in each cluster.
print("\n--- Cluster Distribution ---")
print(df_clustered["cluster"].value_counts().sort_index())

--- Final KMeans Model (K=3) ---
Silhouette Score: 0.5132

--- Cluster Distribution ---
cluster
0    19473
1    39891
2    13134
Name: count, dtype: int64

16 Analyze clusters vs ONET reference labels

# Cross-tabulation of clusters and ONET
# FIX: the reset_index().rename(columns={"index": ...}) chain produced two
# columns both named "count" on pandas >= 2.0 (visible in the earlier output);
# rename_axis + reset_index(name=...) keeps the intended
# ["ONET_NAME", "count"] columns on every pandas version.
print("--- Top ONET categories in each cluster ---")
for cluster_id in range(optimal_k):
    print(f"\n=== Cluster {cluster_id} ===")
    top_onet_df = (
        df_clustered[df_clustered["cluster"] == cluster_id]["ONET_NAME"]
        .value_counts()
        .rename_axis("ONET_NAME")
        .reset_index(name="count")
        .head(5)
    )
    print(top_onet_df)
--- Top ONET categories in each cluster ---

=== Cluster 0 ===
                            count  count
0  Business Intelligence Analysts  19472
1                         Unknown      1

=== Cluster 1 ===
                            count  count
0  Business Intelligence Analysts  39851
1                         Unknown     40

=== Cluster 2 ===
                            count  count
0  Business Intelligence Analysts  13131
1                         Unknown      3

17 Analyze cluster characteristics

print("--- Cluster Characteristics ---")

def _counts_by(frame, col):
    """Return (long-form, wide pivot) counts of `col` within each cluster."""
    long_form = (
        frame
        .groupby(["cluster", col])
        .size()
        .reset_index(name="count")
        .sort_values(["cluster", col])
    )
    wide_form = (
        long_form
        .pivot(index="cluster", columns=col, values="count")
        .fillna(0)
        .astype(int)
    )
    return long_form, wide_form

# === Remote work distribution by cluster ===
print("\n=== Remote Work by Cluster ===")
remote_by_cluster, remote_pivot = _counts_by(df_clustered, "REMOTE_GROUP")
print(remote_by_cluster)

print("\nRemote work pivot table:")
print(remote_pivot)

# === Experience level by cluster ===
print("\n=== Experience Level by Cluster ===")
exp_by_cluster, exp_pivot = _counts_by(df_clustered, "MIN_YEARS_EXPERIENCE_GROUP")
print(exp_by_cluster)

print("\nExperience level pivot table:")
print(exp_pivot)

# === Top states by cluster ===
print("\n=== Top States by Cluster ===")
for cluster_id in range(optimal_k):
    print(f"\n--- Cluster {cluster_id} Top States ---")
    in_cluster = df_clustered[df_clustered["cluster"] == cluster_id]
    print(in_cluster["STATE_NAME"].value_counts().head(5))
--- Cluster Characteristics ---

=== Remote Work by Cluster ===
   cluster REMOTE_GROUP  count
0        0       Hybrid    540
1        0       Onsite  15203
2        0       Remote   3730
3        1       Hybrid   1177
4        1       Onsite  32518
5        1       Remote   6196
6        2       Hybrid    543
7        2       Onsite  10020
8        2       Remote   2571

Remote work pivot table:
REMOTE_GROUP  Hybrid  Onsite  Remote
cluster                             
0                540   15203    3730
1               1177   32518    6196
2                543   10020    2571

=== Experience Level by Cluster ===
    cluster MIN_YEARS_EXPERIENCE_GROUP  count
0         0                     Expert    922
1         0     Internship/Entry Level   7249
2         0                     Junior   3628
3         0                  Mid-Level   3894
4         0                     Senior   3780
5         1                     Expert   1847
6         1     Internship/Entry Level  14474
7         1                     Junior   7179
8         1                  Mid-Level   7681
9         1                     Senior   8710
10        2                     Expert    705
11        2     Internship/Entry Level   4953
12        2                     Junior   2438
13        2                  Mid-Level   2348
14        2                     Senior   2690

Experience level pivot table:
MIN_YEARS_EXPERIENCE_GROUP  Expert  Internship/Entry Level  Junior  Mid-Level  \
cluster                                                                         
0                              922                    7249    3628       3894   
1                             1847                   14474    7179       7681   
2                              705                    4953    2438       2348   

MIN_YEARS_EXPERIENCE_GROUP  Senior  
cluster                             
0                             3780  
1                             8710  
2                             2690  

=== Top States by Cluster ===

--- Cluster 0 Top States ---
STATE_NAME
Texas         2098
California    2041
Florida       1069
Virginia      1000
New York       801
Name: count, dtype: int64

--- Cluster 1 Top States ---
STATE_NAME
Texas         4519
California    3839
Illinois      2417
Virginia      2082
New York      1969
Name: count, dtype: int64

--- Cluster 2 Top States ---
STATE_NAME
Texas         1450
California    1204
Florida        636
New York       571
Virginia       554
Name: count, dtype: int64

18 Visualization: Cluster Size Distribution

import pandas as pd
import plotly.express as px

# Per-cluster job counts, ordered by cluster id.
size_by_cluster = (
    df_clustered["cluster"]
    .value_counts()
    .sort_index()
)

# Two-column frame ("cluster", "count") that px.bar can consume directly.
cluster_counts = pd.DataFrame({
    "cluster": size_by_cluster.index,
    "count": size_by_cluster.values,
})

print(cluster_counts)  # sanity check: columns should be ['cluster', 'count']

# Bar chart of cluster sizes.
# NOTE(review): the output file is named kmeans_elbow_plot.jpg but this figure
# is a cluster-size distribution, not an elbow plot -- filename kept unchanged
# in case other material references it; consider renaming.
fig = px.bar(
    cluster_counts,
    x="cluster",
    y="count",
    title="KMeans Clustering: Distribution of Jobs Across Clusters",
    labels={"cluster": "Cluster", "count": "Number of Jobs"},
    template="plotly_white",
    color="count",
    color_continuous_scale="Blues",
)

fig.update_layout(font=dict(family="Roboto", size=14))

fig.write_image("figures/kmeans_elbow_plot.jpg")
fig
   cluster  count
0        0  19473
1        1  39891
2        2  13134

19 Visualization: Cluster Distribution

import pandas as pd
import plotly.express as px

# Visualization: Cluster Distribution

# Remap sklearn cluster ids via a fixed table (0->2, 1->0, 2->1).
# NOTE(review): presumably this aligns the ids with an earlier Spark run
# (hence the "cluster_spark" column name) -- confirm against that output.
spark_id_map = {0: 2, 1: 0, 2: 1}
df_clustered["cluster_spark"] = df_clustered["cluster"].map(spark_id_map)

cluster_counts = (
    df_clustered["cluster_spark"]
    .value_counts()
    .sort_index()
)

# Bar chart of the remapped cluster sizes.
fig = px.bar(
    x=cluster_counts.index,
    y=cluster_counts.values,
    color=cluster_counts.values,
    color_continuous_scale="Blues",
    labels={"x": "Cluster", "y": "Number of Jobs", "color": "Number of Jobs"},
    title="KMeans Clustering: Distribution of Jobs Across Clusters",
    template="plotly_white",
)

fig.update_layout(font=dict(family="Roboto", size=14))

fig.write_image("figures/kmeans_cluster_distribution.jpg")
fig

20 Choropleth Map: Remote Work Percentage by State

import pandas as pd
import numpy as np

# Calculate remote work percentage by state (pandas version)

# 1. Counts per (STATE_NAME, REMOTE_GROUP) pair.
state_remote = (
    df_analysis
    .groupby(["STATE_NAME", "REMOTE_GROUP"])
    .size()
    .reset_index(name="count")
)

# 2. Pivot: index = STATE_NAME, columns = REMOTE_GROUP (Onsite / Remote / Hybrid)
state_df = (
    state_remote
    .pivot(index="STATE_NAME", columns="REMOTE_GROUP", values="count")
    .fillna(0)
    .reset_index()
)

# FIX: guard ALL three expected categories, not just "Hybrid" -- a data slice
# missing "Onsite" or "Remote" entirely would otherwise raise KeyError below.
for group_col in ("Onsite", "Remote", "Hybrid"):
    if group_col not in state_df.columns:
        state_df[group_col] = 0

# Total postings per state and the share that is fully remote.
state_df["Total"] = state_df["Onsite"] + state_df["Remote"] + state_df["Hybrid"]
state_df["Remote_Pct"] = (state_df["Remote"] / state_df["Total"] * 100).round(2)

print("--- State Remote Work Data ---")
print(state_df.head(10))
--- State Remote Work Data ---
REMOTE_GROUP   STATE_NAME  Hybrid  Onsite  Remote   Total  Remote_Pct
0                 Alabama    15.0   567.0   108.0   690.0       15.65
1                  Alaska     7.0   132.0    97.0   236.0       41.10
2                 Arizona    43.0  1349.0   246.0  1638.0       15.02
3                Arkansas    12.0   464.0   108.0   584.0       18.49
4              California   262.0  5766.0  1056.0  7084.0       14.91
5                Colorado    55.0  1141.0   259.0  1455.0       17.80
6             Connecticut    41.0   660.0   162.0   863.0       18.77
7                Delaware    15.0   330.0    93.0   438.0       21.23
8                 Florida    72.0  3060.0   513.0  3645.0       14.07
9                 Georgia    64.0  2169.0   425.0  2658.0       15.99

21 Add state abbreviations for Plotly map

# Map full state names to their two-letter USPS abbreviations
# (plotly's 'USA-states' location mode requires the abbreviations).
state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR', 'California': 'CA',
    'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE', 'Florida': 'FL', 'Georgia': 'GA',
    'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
    'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO',
    'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
    'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY',
    'District of Columbia': 'DC'
}

# Attach the abbreviation column; names not in the table map to NaN.
state_df['State_Abbrev'] = state_df['STATE_NAME'].map(state_abbrev)

# Drop rows whose state name did not map (e.g., "Unknown").
state_df_clean = state_df.dropna(subset=['State_Abbrev'])

print(f"States with data: {len(state_df_clean)}")
print(state_df_clean[['STATE_NAME', 'State_Abbrev', 'Total', 'Remote_Pct']].head(10))
States with data: 50
REMOTE_GROUP   STATE_NAME State_Abbrev   Total  Remote_Pct
0                 Alabama           AL   690.0       15.65
1                  Alaska           AK   236.0       41.10
2                 Arizona           AZ  1638.0       15.02
3                Arkansas           AR   584.0       18.49
4              California           CA  7084.0       14.91
5                Colorado           CO  1455.0       17.80
6             Connecticut           CT   863.0       18.77
7                Delaware           DE   438.0       21.23
8                 Florida           FL  3645.0       14.07
9                 Georgia           GA  2658.0       15.99

22 Choropleth Map with State Labels showing Remote Percentage

import plotly.graph_objects as go

# Choropleth: % of each state's postings that are fully remote,
# with the rounded percentage drawn on every state.

# Hover text: "XX<br>NN.N%" per state.
hover_text = (
    state_df_clean['State_Abbrev']
    + '<br>'
    + state_df_clean['Remote_Pct'].astype(str)
    + '%'
)

fig = go.Figure(data=go.Choropleth(
    locations=state_df_clean['State_Abbrev'],
    z=state_df_clean['Remote_Pct'],
    locationmode='USA-states',
    colorscale='Blues',
    colorbar_title='Remote %',
    text=hover_text,
    hovertemplate='<b>%{text}</b><br>Total Jobs: %{customdata[0]}<br>Remote Jobs: %{customdata[1]}<extra></extra>',
    customdata=state_df_clean[['Total', 'Remote']].values,
    marker_line_color='white',
    marker_line_width=1,
))

# Overlay the rounded percentage as a text label at each state's centroid.
fig.add_scattergeo(
    locations=state_df_clean['State_Abbrev'],
    locationmode='USA-states',
    text=state_df_clean['Remote_Pct'].apply(lambda pct: f'{pct:.0f}%'),
    mode='text',
    textfont=dict(size=8, color='black', family='Arial Black'),
    showlegend=False,
)

fig.update_layout(
    title_text='Remote Work Opportunity by State (% of Jobs that are Remote)',
    title_font_size=18,
    geo=dict(
        scope='usa',
        projection_type='albers usa',
        showlakes=True,
        lakecolor='rgb(255, 255, 255)',
        bgcolor='rgba(0,0,0,0)',
    ),
    template='plotly_white',
    font=dict(family="Roboto", size=14),
    height=600,
    width=1000,
)

fig.write_image("figures/choropleth_remote_work_with_labels.jpg")
print("Enhanced choropleth map saved!")
fig
Enhanced choropleth map saved!

23 Choropleth Map: Total Job Postings by State (with labels)

import plotly.graph_objects as go

# Choropleth: total postings per state, labelled for the 15 busiest states.

# Abbreviations of the 15 states with the most postings.
top_states = state_df_clean.nlargest(15, 'Total')['State_Abbrev'].tolist()

fig = go.Figure(data=go.Choropleth(
    locations=state_df_clean['State_Abbrev'],
    z=state_df_clean['Total'],
    locationmode='USA-states',
    colorscale='Greens',
    colorbar_title='Total Jobs',
    hovertemplate='<b>%{location}</b><br>Total Jobs: %{z:,}<br>Remote: %{customdata[0]:,}<br>Onsite: %{customdata[1]:,}<extra></extra>',
    customdata=state_df_clean[['Remote', 'Onsite']].values,
    marker_line_color='white',
    marker_line_width=1.5,
))

def _fmt_total(total):
    """Compress totals >= 1000 to 'X.XK'; smaller counts stay plain ints."""
    return f'{total/1000:.1f}K' if total >= 1000 else str(int(total))

# Text labels only for the top states.
top_state_df = state_df_clean[state_df_clean['State_Abbrev'].isin(top_states)].copy()
top_state_df['Total_Label'] = top_state_df['Total'].apply(_fmt_total)

fig.add_scattergeo(
    locations=top_state_df['State_Abbrev'],
    locationmode='USA-states',
    text=top_state_df['Total_Label'],
    mode='text',
    textfont=dict(size=10, color='darkgreen', family='Arial Black'),
    showlegend=False,
)

fig.update_layout(
    title_text='Total Job Postings by State<br><sup>Labels shown for top 15 states by job volume</sup>',
    title_font_size=16,
    geo=dict(
        scope='usa',
        projection_type='albers usa',
        showlakes=True,
        lakecolor='rgb(255, 255, 255)',
    ),
    template='plotly_white',
    font=dict(family="Roboto", size=14),
    height=600,
    width=1000,
)

fig.write_image("figures/choropleth_total_jobs_with_labels.jpg")
print("Enhanced total jobs choropleth map saved!")
fig
Enhanced total jobs choropleth map saved!

24 Bar Chart: Top 10 States by Remote Work Percentage

# Only states with >= 100 postings, so the percentages are meaningful.
eligible_states = state_df_clean[state_df_clean['Total'] >= 100]

# Ten states with the highest remote share.
top10_remote = eligible_states.nlargest(10, 'Remote_Pct')

# Bar chart, one bar per state, annotated with the percentage.
fig = px.bar(
    top10_remote,
    x='STATE_NAME',
    y='Remote_Pct',
    color='Remote_Pct',
    color_continuous_scale='Blues',
    title='Top 10 States with Highest Remote Work Opportunities',
    labels={'STATE_NAME': 'State', 'Remote_Pct': 'Remote Work %'},
    text='Remote_Pct',
)

fig.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
fig.update_layout(
    template='plotly_white',
    font=dict(family="Roboto", size=14),
    xaxis_tickangle=-45,
    showlegend=False,
)

fig.write_image("figures/top10_remote_states.jpg")
print("Top 10 remote states bar chart saved!")
fig
Top 10 remote states bar chart saved!